Please Refer to the Code Below and Instructions
Loading Libraries¶
import pandas as pd
import numpy as np
#Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, \
roc_curve, roc_auc_score, average_precision_score, auc, f1_score
#MatPlotLib
import matplotlib.pyplot as plt
import seaborn as sns
# pip install dash
# import dash
# import dash_core_components as dc# c
# import dash_html_components as ht# ml
# from dash.dependencies import Input, Output
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
Loading Functions¶
Data Preparation
Count the duplicated values for each column
def count_duplicated(df):
    """Return a one-row DataFrame with the duplicated-value count per column.

    For each column, counts how many rows repeat a value already seen in
    that column (pandas `duplicated` marks all occurrences after the first).
    """
    counts = {col: df.duplicated(subset=[col]).sum() for col in df.columns}
    return pd.DataFrame(counts, index=["Duplicated Values"])
Compare the number of duplicated values with the number of unique values and the total number of values, for each column.
Create the interquartile range and the bounds for the interquartile rule.
After that, for each column computes the number of values outside the bounds
def check_duplicated_count(df, df_values):
    """Verify that duplicated + unique counts add up to the row count.

    df: the data frame under inspection.
    df_values: one-row frame of per-column duplicated counts
               (output of count_duplicated).

    Raises ValueError on the first column where the invariant fails.
    """
    # To confirm the result, we count the number of unique values in each column
    unique_counts = {column: df[column].nunique() for column in df.columns}
    n_rows = df.shape[0]
    # And, then, we check if the results are the same for every variable
    for column in df.columns:
        if (int(df_values[column]) + int(unique_counts[column]) - int(n_rows)) != 0:
            # BUG FIX: the original raised a plain string, which is itself a
            # TypeError in Python 3 ("exceptions must derive from BaseException").
            raise ValueError('There is an error')
    # (sic) message kept byte-identical to the recorded notebook output
    return 'Everthing is good!'
def check_for_outliers(df):
    """Count, per column, the values outside the 1.5*IQR interquartile bounds."""
    counts = {}
    for col in df.columns:
        first_q = df[col].quantile(0.25)
        third_q = df[col].quantile(0.75)
        spread = third_q - first_q
        low = first_q - 1.5 * spread
        high = third_q + 1.5 * spread
        # anything not inside [low, high] is a suspected outlier
        counts[col] = len(df) - df[col].between(low, high).sum()
    return pd.DataFrame(counts, index=['Outliers'])
Data Visualization
def histogram_plots(df_good, df_fraud, features):
    """Overlay normalized histograms of each feature for good vs fraud rows.

    df_good / df_fraud: frames holding the negative / positive class rows.
    features: iterable of column names to plot (at most 33 fit the 11x3 grid).
    """
    plt.figure(figsize=(25, 25))
    for i, feature in enumerate(features):
        plt.subplot(11, 3, i + 1)
        # BUG FIX: density must be a boolean; the string 'TRUE' only worked
        # because any non-empty string is truthy.
        plt.hist(df_good[feature], density=True, bins=100, label='Good transactions', alpha=0.5)
        plt.hist(df_fraud[feature], density=True, bins=100, label='Fraud transactions', alpha=0.5)
        plt.grid()
        plt.xlabel(feature + " value")
        plt.ylabel("Density")
        plt.legend()
    plt.tight_layout()
def boxplot_plots(df, target_column, features):
    """Draw one boxplot per feature, grouped by the target class column."""
    plt.figure(figsize=(25, 25))
    for idx, feature in enumerate(features):
        plt.subplot(10, 3, idx + 1)
        sns.boxplot(x=df[target_column], y=df[feature])
        plt.ylabel(feature + ' value')
        plt.tight_layout()
        plt.grid()
def violinplots (df, target_column, exclude_this_column):
    """Draw split violin plots comparing the two classes for every feature.

    df: frame of (already scaled) features plus the target column.
    target_column: name of the binary class column (1 = positive class).
    exclude_this_column: extra feature to drop; 'Amount' is always dropped.
    """
    features_v = df.copy()
    features_v.set_index(target_column, inplace=True)
    # Reshape to long format: one row per (observation, variable) pair;
    # iloc[1:] drops the first stacked row.
    stacked = features_v.stack().reset_index(level=target_column).reset_index().iloc[1:]
    stacked.columns = ['Variable',target_column,'Normalized_Value']
    # NOTE(review): chained boolean indexing (frame[m1][m2]) relies on index
    # alignment and may warn; combining the masks with & would be safer — confirm.
    stacked = stacked[stacked['Variable']!= exclude_this_column][stacked['Variable']!= 'Amount']
    print(stacked["Variable"])
    fig = go.Figure()
    # Positive class drawn on the left half of each violin.
    fig.add_trace(go.Violin(x=stacked["Variable"][ stacked[target_column] == 1 ],
                            y=stacked["Normalized_Value"][ stacked[target_column] == 1 ],
                            legendgroup='Positive Class',
                            scalegroup='Yes',
                            name='Yes',
                            side='negative',
                            line_color='blue')
    )
    # Negative class drawn on the right half.
    fig.add_trace(go.Violin(x=stacked["Variable"][ stacked[target_column] == 0 ],
                            y=stacked["Normalized_Value"][ stacked[target_column] == 0 ],
                            legendgroup='Negative Class',
                            scalegroup='No',
                            name='No',
                            side='positive',
                            line_color='orange')
    )
    fig.update_layout(
        autosize=False,
        width=3000,
        height=600,
        title='Interactive Violinplots'
    )
    fig.update_traces(meanline_visible=False)
    # Overlay the two halves so each variable shows a single split violin.
    fig.update_layout(violingap=0, violinmode = 'overlay')
    fig.show()
Data Selection
Used to create the three dataset studied, they are different in the number of Class 0 events
def create_df(set_):
    """Split a dataset into 70% training, 20% cross-validation and 10% test."""
    n = len(set_)
    train_end = round(0.7 * n)
    cv_end = round(0.9 * n)
    df_training = set_[:train_end]
    df_cv = set_[train_end:cv_end]
    df_test = set_[cv_end:]
    return (df_training, df_cv, df_test)
def print_info_df(df_training, df_cv, df_test):
    """Print the size and fraud/good composition of the three splits.

    Note: the 'trainig' spelling is kept as-is to match the notebook's
    recorded output.
    """
    splits = (("trainig", df_training, ""), ("cv", df_cv, ""), ("test", df_test, "\n"))
    for label, part, tail in splits:
        n_fraud = len(part[part['Class'] == 1])
        n_good = len(part[part['Class'] == 0])
        print("The " + label + " dataset has: " + str(len(part))
              + " events, and there are " + str(n_fraud)
              + " fraud events and " + str(n_good) + " good events." + tail)
def plot_heat_map(set_, features,):
    """Build a plotly Heatmap trace of the correlation matrix of `features`.

    NOTE(review): this depends on the notebook-level global `set_1` — the
    colour scale is shown only for the dataset whose length equals
    len(set_1), so the legend appears exactly once in the combined figure.
    Confirm set_1 is defined before calling.
    """
    heat_df = set_.loc[:, features ].corr()
    if len(set_) == len(set_1):
        heat= go.Heatmap( z = heat_df,x =features ,y =features, hoverongaps = False) #We plot the legend only one time
        return heat
    else:
        heat= go.Heatmap( z = heat_df,x =features ,y =features, hoverongaps = False, showscale=False)
        return heat
def heatmap_matrix(datasets, plot_titles, features):
    """Plot four correlation heatmaps side by side.

    datasets: [full_df, set_1, set_2, set_3]; the three sets are drawn first
    and the full dataset goes in the last column, matching plot_titles.
    """
    fig = make_subplots(rows=1, cols=4, subplot_titles=plot_titles,
                        shared_xaxes=True, shared_yaxes=True)
    for col, ds_index in enumerate((1, 2, 3, 0), start=1):
        fig.add_trace(plot_heat_map(datasets[ds_index], features), 1, col)
    fig.update_layout(
        autosize=False,
        width=1400,
        height=450,
        title={'text': ' Correlation matrix by Set',
               'y': 0.95,
               'x': 0.5,
               'xanchor': 'center',
               'yanchor': 'top'}
    )
    fig.show()
def barplots(variable_names, coefficients, title, y_axis_name):
    """Grouped bar chart of the model coefficients for the three sets.

    variable_names: x-axis labels.
    coefficients: sequence of three coefficient arrays (one per set).
    """
    palette = ('#0892A5', '#A1E5AB', '#A23E48')
    fig = go.Figure()
    for set_idx, bar_color in enumerate(palette):
        fig.add_trace(go.Bar(x=variable_names,
                             y=coefficients[set_idx].flatten(),
                             name='Set ' + str(set_idx + 1),
                             marker_color=bar_color))
    fig.update_layout(
        title=title,
        xaxis_tickfont_size=14,
        yaxis=dict(
            title=y_axis_name,
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(
            x=0,
            y=1.0,
            # transparent legend background and border
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        barmode='group',
        bargap=0.15,     # gap between bars of adjacent location coordinates.
        bargroupgap=0.1  # gap between bars of the same location coordinate.
    )
    fig.show()
Machine Learning
In this function we summarize all the steps to train and evaluate the performance of a model, starting from the datasets used for training, cross-validation and testing.
def model_evaluation(df_training, df_cv, df_test, features, model):
    """Fit `model` on the training split and report its test-set performance.

    features: column names used as predictors; 'Class' is the target.
    Returns (X_train, Y_train, X_cv, Y_cv, X_test, Y_test,
    Y_test_predict_proba) where the last item holds the predicted
    probabilities of the positive class on the test set.
    """
    X_train = df_training.loc[:, features].to_numpy()  # predictors for training
    Y_train = df_training.loc[:, 'Class'].to_numpy()   # classification target
    X_cv = df_cv.loc[:, features].to_numpy()
    Y_cv = df_cv.loc[:, 'Class'].to_numpy()
    X_test = df_test.loc[:, features].to_numpy()
    Y_test = df_test.loc[:, 'Class'].to_numpy()
    model.fit(X_train, Y_train)  # fit the model with the training data
    # Predict once and reuse (the original called model.predict(X_test) twice).
    Y_test_predict = model.predict(X_test)
    Y_test_predict_proba = model.predict_proba(X_test)[:, 1]
    print(classification_report(Y_test, Y_test_predict))
    # Average precision is used because it is robust to the class imbalance.
    print("Modello finale score (avg precision): " + str(average_precision_score(Y_test, Y_test_predict)))
    return (X_train, Y_train, X_cv, Y_cv, X_test, Y_test, Y_test_predict_proba)
def calculate_plot_cv_LR(X_train, Y_train, X_cv, Y_cv, values):
    """Train one logistic regression per C in `values` and plot the scores.

    For every candidate inverse-regularization strength, fits the model and
    records the average-precision score on the training and CV samples.
    Returns (train_scores, cv_scores), aligned with `values`.
    """
    train_scores = []
    cv_scores = []
    for value in values:  # train and evaluate the score for every candidate C
        model = LogisticRegression(C=value, solver='liblinear', random_state=42,
                                   penalty='l2', class_weight='balanced', max_iter=1000)
        model.fit(X_train, Y_train)
        train_scores.append(average_precision_score(Y_train, model.predict(X_train)))
        cv_scores.append(average_precision_score(Y_cv, model.predict(X_cv)))
    plt.figure(figsize=(10, 5))
    # BUG FIX: plot against the `values` parameter, not the notebook-global
    # C_values — the two silently diverge if the caller passes anything else.
    plt.plot(values, train_scores, label='Training', marker='o', lw=0, ms=10)
    plt.plot(values, cv_scores, label='Cross-Validation', marker='*', lw=0, ms=12)
    plt.xlabel("C", fontsize=20)
    plt.ylabel("average precision", fontsize=20)
    plt.xlim(1e-9, 1e9)
    plt.xscale('log')
    plt.grid()
    plt.legend(loc='best', fontsize=20)
    plt.tick_params(length=2, width=2, grid_alpha=1, labelsize=20)
    return (train_scores, cv_scores)
def precision_recall_plot(precision_list, recall_list, labels):
    """Overlay the precision-recall curves of the three trained models."""
    plt.figure(figsize=(10, 6))
    for curve_idx in range(3):
        plt.plot(precision_list[curve_idx], recall_list[curve_idx],
                 label=labels[curve_idx])
    plt.xlabel('precision')
    plt.ylabel('recall')
    plt.legend()
    plt.grid()
def plot_confusion_matrix(Y, X, model, ax):
    """Render a 2x2 confusion matrix for `model` on (X, Y) into axis `ax`."""
    cm = confusion_matrix(Y, model.predict(X))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    ax.set_ylim(1.5, -0.5)
    # annotate every cell with its raw count
    for row in range(2):
        for col in range(2):
            ax.text(col, row, cm[row, col], ha='center', va='center', color='red')
def confusion_matrix_plots(Y, X, models, title):
    """Draw the confusion matrices of the three models side by side."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 8))
    for idx in range(3):
        plot_confusion_matrix(Y[idx], X[idx], models[idx], axes[idx])
        axes[idx].title.set_text('Set ' + str(idx + 1))
    plt.subplots_adjust(wspace=0.5)
    fig.suptitle(title)
    plt.show()
def plot_proba(model, X_test, ax, title):
    """Histogram the predicted class probabilities on the test set.

    For a binary 0/1 problem sklearn orders classes_ as [0, 1], so
    predict_proba column 0 is P(class 0) = good transaction and column 1 is
    P(class 1) = fraud. BUG FIX: the original labels were swapped
    (column 0 was labelled 'Fraud').
    """
    predict_proba = model.predict_proba(X_test)
    ax.hist(predict_proba[:, 0], density=True, label='Good', alpha=0.5, bins=10)
    ax.hist(predict_proba[:, 1], density=True, label='Fraud', alpha=0.5, bins=10)
    ax.set_xlabel('Predicted probabilities')
    ax.set_ylabel('Counts')
    ax.set_title(title)
    ax.grid()
    ax.legend()
def plot_learning_curves(model, title, ax, x_train, y_train):
    """Plot 3-fold learning curves (train vs validation score) on `ax`."""
    sizes, tr_scores, val_scores = learning_curve(model, x_train, y_train, cv=3)
    ax.plot(sizes, np.mean(tr_scores, axis=1), marker='*', label='Train')
    ax.plot(sizes, np.mean(val_scores, axis=1), marker='.', label='Validation')
    ax.set_xlabel('Size training sample')
    ax.set_ylabel('Accuracy of the model')
    ax.grid()
    ax.legend()
    ax.set_title('Learning Curves ' + str(title))
Loading Dataset¶
Read the dataset in a cvs format and create the variable 'df', it will be used in all the future steps.
THE DATASET MUST BE IN THE SAME DIRECTORY AS THE NOTEBOOK
# Load the credit-card transactions dataset (the CSV must sit in the same
# directory as the notebook); `df` is used by every later step.
df=pd.read_csv("creditcard.csv")
Introduction to the DataSet¶
The dataset contains over 284,000 transactions between european cardholders in September 2013.
There are over 30 variables, most of them (28) are crypted. The only three columns not crypted are the time, the amount of the transaction and the target class.
Only 492 out of 284,807 transactions, i.e. 0.172%, are classified as frauds. Therefore, the dataset is highly unbalanced towards the negative class.
The main risks of an unbalanced dataset are overfitting and detecting the right correlations (we are limited by the PCA transformation in the majority of columns)
DATA SCRUBBING
In this section we will understand the quality and the integrity of the database. We will try to detect and remove not relevant data in order to improve the quality of the analysis and the prediction of the machine learning models deployed.
Missing Data
First, we check if there are any columns with missing data
# A clean dataset has zero NaNs summed across all columns.
# (Idiom fix: the redundant `== True` comparison was removed.)
if df.isna().sum().sum() == 0:
    print("There are no missing data, we're lucky!")
else:
    print("This need further investigation")
There are no missing data, we're lucky!
Finding Duplicates
Then, we check the number of duplicate rows in the dataset
# Report duplicated rows, if any (stays silent when there are none).
# (Idiom fix: the redundant `== True` comparison was removed.)
if df.duplicated().any():
    duplicated_rows = df.duplicated().sum()
    print('There are {} rows duplicated'.format(str(duplicated_rows)))
There are 1081 rows duplicated
And we remove them.
# Drop exact duplicate rows, reporting the row count before and after.
print("Original Number of Rows:" + str(df.shape[0]))
df = df.drop_duplicates()
print("Filtered Number without Duplicated Rows:" + str(df.shape[0]))
Original Number of Rows:284807 Filtered Number without Duplicated Rows:283726
Next, we need to check if there are columns that have only one unique value. In that case, we can safely remove them (and simplify the computations) because they don't give any additional information.
# Per-column duplicated-value counts (table displayed below).
df_values = count_duplicated(df)
df_values
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Duplicated Values | 159134 | 8063 | 8063 | 8063 | 8063 | 8063 | 8063 | 8063 | 8063 | 8063 | ... | 8063 | 8063 | 8063 | 8063 | 8063 | 8063 | 8063 | 8063 | 250959 | 283724 |
1 rows × 31 columns
# Sanity check: duplicated + unique counts must equal the row count per column.
check_duplicated_count(df, df_values)
'Everthing is good!'
Luckily, there is no feature with this characteristic
Detecting Outliers
Outliers can be very dangerous: even a single one (with very large value or a small one) can greatly reduce the precision of our models.
To identify them, we use the IQR Rule.
INTERQUARTILE RULE
- Calculate the interquartile range for the data
- Multiply the interquartile range (IQR) by 1.5 (a constant used to discern outliers).
- Upper_Bound: Add 1.5 x (IQR) to the third quartile. Any number greater than this is a suspected outlier.
- Lower_Bound: Subtract 1.5 x (IQR) from the first quartile. Any number less than this is a suspected outlier.
# Per-column count of suspected outliers under the 1.5*IQR rule.
check_for_outliers(df)
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Outliers | 0 | 6948 | 13390 | 3306 | 11094 | 12221 | 22886 | 8839 | 23904 | 8199 | ... | 14401 | 1298 | 18467 | 4758 | 5333 | 5665 | 38799 | 30094 | 31685 | 473 |
1 rows × 31 columns
We can see that the features V27 and V28 have many outliers. So, we must decide if these variable are so important that we should remove around 15% of our database or we should simply ignore them.
Our preliminary study suggests us that the group of variables from V20 to V28 offer little to no information regarding the prediction of the target class. As a consequence, we decided to drop them.
We're ready to summarize what we've learned so far:
- The dataset is very unbalanced towards the negative class
- There are no missing values
- There were more than one thousand rows duplicated
- There are some features with many outliers (like V27 and V28). In the next steps we will remove the variable from 20 to 28 because they're poorly correlated with the target class so they offer minimal information with an high degree of risk.
You can see in the photo below the absolute values of weight of the coefficients in the logistic regression, method used in our preparation study. The higher the coefficient, the greater the discriminating power of the relative variable. So features between 20 and 28 are not very important related to V3 or V4 for example.
DATA EXPLORING¶
Describing the Dataset¶
First, we explore the dataset using the built-in pandas method .describe()
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
df.describe()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | ... | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 | 283726.000000 |
| mean | 94811.077600 | 0.005917 | -0.004135 | 0.001613 | -0.002966 | 0.001828 | -0.001139 | 0.001801 | -0.000854 | -0.001596 | ... | -0.000371 | -0.000015 | 0.000198 | 0.000214 | -0.000232 | 0.000149 | 0.001763 | 0.000547 | 88.472687 | 0.001667 |
| std | 47481.047891 | 1.948026 | 1.646703 | 1.508682 | 1.414184 | 1.377008 | 1.331931 | 1.227664 | 1.179054 | 1.095492 | ... | 0.723909 | 0.724550 | 0.623702 | 0.605627 | 0.521220 | 0.482053 | 0.395744 | 0.328027 | 250.399437 | 0.040796 |
| min | 0.000000 | -56.407510 | -72.715728 | -48.325589 | -5.683171 | -113.743307 | -26.160506 | -43.557242 | -73.216718 | -13.434066 | ... | -34.830382 | -10.933144 | -44.807735 | -2.836627 | -10.295397 | -2.604551 | -22.565679 | -15.430084 | 0.000000 | 0.000000 |
| 25% | 54204.750000 | -0.915951 | -0.600321 | -0.889682 | -0.850134 | -0.689830 | -0.769031 | -0.552509 | -0.208828 | -0.644221 | ... | -0.228305 | -0.542700 | -0.161703 | -0.354453 | -0.317485 | -0.326763 | -0.070641 | -0.052818 | 5.600000 | 0.000000 |
| 50% | 84692.500000 | 0.020384 | 0.063949 | 0.179963 | -0.022248 | -0.053468 | -0.275168 | 0.040859 | 0.021898 | -0.052596 | ... | -0.029441 | 0.006675 | -0.011159 | 0.041016 | 0.016278 | -0.052172 | 0.001479 | 0.011288 | 22.000000 | 0.000000 |
| 75% | 139298.000000 | 1.316068 | 0.800283 | 1.026960 | 0.739647 | 0.612218 | 0.396792 | 0.570474 | 0.325704 | 0.595977 | ... | 0.186194 | 0.528245 | 0.147748 | 0.439738 | 0.350667 | 0.240261 | 0.091208 | 0.078276 | 77.510000 | 0.000000 |
| max | 172792.000000 | 2.454930 | 22.057729 | 9.382558 | 16.875344 | 34.801666 | 73.301626 | 120.589494 | 20.007208 | 15.594995 | ... | 27.202839 | 10.503090 | 22.528412 | 4.584549 | 7.519589 | 3.517346 | 31.612198 | 33.847808 | 25691.160000 | 1.000000 |
8 rows × 31 columns
Looking at the "Amount" column we can see that the dataset has a very wide range of transactions, starting from as little as 5.6€ and going over 25.000€ in at least one case.
We can also see that the average transaction is much more close to the minimum than to the maximum and the median transaction is 22€. Therefore, we can expect a positive skewness in the curve.
Data Visualization¶
In our dataset there are 2 types of classes: one is the fraudulent transaction (labeled as 1), the other is the good transaction class (labeled as 0). We will use two different datasets to explore the different characteristics among the classes and the features.
#Divide the main dataset (df) in others two to distinguish the good and fraud transactions
df_good=df[df['Class']==0]   # negative class: legitimate transactions
df_fraud=df[df['Class']==1]  # positive class: frauds
print("In the dataset there are "+str(len(df_good))+" good transactions against "+str(len(df_fraud))+" fraudolent transactions")
print("The percentage of fraud transactions agains the total is "+ str((round((len(df_fraud)*100/len(df)),3)))+"%")
In the dataset there are 283253 good transactions against 473 fraudolent transactions The percentage of fraud transactions agains the total is 0.167%
# Keep the full list of column names; reused later for feature selection.
features=df.columns
features
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
# First five rows of the cleaned dataset.
df.head()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
In this section we report the distributions of our variables distincted among the two classes, this plots give us a general idea about the distributions. We will repeat the plot using the training data.
In our preliminary study, we have seen that these histograms (and the boxplots) weren't much descriptive of the data. So we have chosen instead to use them in the later sections of the notebook.
Histogram by Target Class
All the plots are interactive but is not possible to display that on kaggle so I selected some features and datasets. The code to make the plots interactive is commented into the cell code.
# Histogram of a single feature, split by target class.
plot_this_feature='V3'
plt.figure(figsize=(23, 8))
# BUG FIX: density must be a boolean; the string 'TRUE' only worked because
# any non-empty string is truthy. The unused `ax=` bindings were dropped.
plt.hist(df_good[plot_this_feature], density=True, bins=100, label='Good transactions', alpha=0.5)
plt.hist(df_fraud[plot_this_feature], density=True, bins=100, label='Fraud transactions', alpha=0.5)
plt.grid()
plt.title('Histogram of the Feature ' + str(plot_this_feature), fontsize = 22, pad= 25)
plt.xlabel(plot_this_feature+" value", fontsize = 18, labelpad=10)
plt.ylabel("Density", fontsize = 18, labelpad=10)
plt.legend(fontsize = 18)
plt.show()
We can start to see the first indicators of fraudolent transactions. For example, they tend to have higher V4 and V11 values
We're ready to summarize what we've learned until now:
- The "amount" column is highly unbalanced
- Variables V4 and V11 seem somewhat useful in detecting frauds
- The amount and the time don't seem to correlate very well with the target class
- We can not really say anything about the distribution of the variables because they have not real meaning (mathematical of physical).
DATA MODELLING¶
Data Selection
In this section we will scale the features that are not scaled (Time and Amount) and we will create the datasets that we will use for the training, cross-validation and test of our classification models. We have to decide which features we will use, which kind of data and the composition of our dataset. A general idea could be:
Set 1 ⟶ 50% Class 1 + 50% Class 0
Set 2 ⟶ 25% Class 1 + 75% Class 0
Set 3 ⟶ 10% Class 1 + 90% Class 0
Then we will split each of these 3 sets into 3 different subsets, for the training (+cv) and test parts (90%+10%). During the project we will study the performance of the model changing the training sample to understand which one gives the best performance.
In order to have a consistent story-telling, we will set a random state seed so the "random shuffling" will return each time the same outputs.
random_state_seed=42 #"The answer" — fixed seed so every shuffle is reproducible
fraud_len=len(df_fraud)
# Each set keeps every fraud row and takes a different number of good rows
# from a seeded shuffle, producing three class-balance scenarios.
#This set is composed by the same number of fraud and good transactions (50/50)
set_1=pd.concat([df_fraud.sample(frac=1, random_state=random_state_seed), df_good.sample(frac=1, random_state=random_state_seed)[:fraud_len]], axis=0)
#This set is slightly unbalanced towards the negative class (about 25/75)
set_2=pd.concat([df_fraud.sample(frac=1, random_state=random_state_seed), df_good.sample(frac=1, random_state=random_state_seed)[:int(fraud_len*3)]])
#This set is heavily unbalanced towards the negative class (about 10/90)
set_3=pd.concat([df_fraud.sample(frac=1, random_state=random_state_seed), df_good.sample(frac=1, random_state=random_state_seed)[:int(fraud_len*9)]])
Standardizing the features using a standard deviation is important when we compare measurements that have different units. Variables that are measured at different scales do not contribute equally to the analysis and might end up creating a bias.
For example, a variable that ranges between 0 and 1000 will outweigh a variable that ranges between 0 and 1. Using these variables without standardization will give the variable with the larger range a weight of 1000 in the analysis. Transforming the data to comparable scales can prevent this problem. Typical data standardization procedures equalize the range and/or data variability.
In our case, we have standardized all the columns minus the target class using the scikitlearn integrated functions MinMaxScaler and fit_transform
#Standardize the values of all the variables in the dataset except for the target one
# Each feature column is rescaled independently into [-1, 1]; fit_transform
# is called per column, so every feature gets its own min/max.
for elem in (set_1,set_2,set_3):
    scaler=MinMaxScaler((-1,1))
    for column in elem.columns:
        if column!='Class': #target class — left untouched
            #scaler= StandardScaler()
            elem[column]=scaler.fit_transform(elem[[column]])
Now we are ready for some exploring!
In the two cell below you can see how differently boxplots and violinplots supplies information to the viewer.
Clearly, violin plots bear more information at a glance, but they're certainly more difficult to read. They're more suited for features where minimal differences in the distribution may be important.
On the other hand, the boxplots give a clear view of the main statistical measures (quartiles, iqr and outliers) while tossing away the kernel distribution of the feature.
Box Plot by Variable
# Boxplot of one selected feature of set_1, grouped by class.
use_this_set=set_1
plot_this_feature = 'V5'
plt.figure(figsize=(25, 8))
ax = sns.boxplot(x=use_this_set['Class'], y=use_this_set[plot_this_feature])
plt.ylabel(plot_this_feature+' value', fontsize = 18, labelpad=10)
plt.xlabel('Class', fontsize = 18, labelpad=10)
plt.title('Boxplot of the Feature ' + str(plot_this_feature), fontsize = 22, pad= 25)
plt.show()
Next, we can explore the relations between the different features of the dataset.
We have plotted correlation matrices using heatmaps. Below, you can see an heatmap for every set we've generated at the start of the session and an heatmap for the original dataset.
Note that the matrices look very similar.
This makes sense.
The 3 sets were random sampled from the original one, we would have a problem if one of the matrices looked much different from the others.
# Correlation heatmaps for the three sampled sets and the full dataset.
heatmap_matrix(datasets = [df, set_1, set_2, set_3], plot_titles = ["Set 1 (50/50)", "Set 2 (25/75)" , "Set 3 (10/90)", "Total Dataset"], features = features)
The plots give us additional information about the variables that influence the target class. We can see, as we thought, that V4, V11 and V18 are strongly correlated in some way with a fraudulent transaction. But there are many more indicators that show up (like V12 and V14), which means that our model can be more precise.
From these plots we can understand how important the data selection is: without it, we would have less correlation between the Class and the features, as the third plot shows. For example, the correlation between 'Class' and V4 is 0.70 for set 1 but only 0.13 for the whole dataset.
Machine Learning¶
In this section we will discuss about machine learning application, in particular this project is about a classification task. We want to create a model that can classify the different classes, in this case fraud or good trasactions. We will try a linear model, the logistic regression on different datasets (set1,set2 and set3) and a Random Forest, a more complex model.
The dataset will be divided into three parts: one for the training, one for the cross validation (to avoid overfitting) and the latter for the test or rather to calculate the performance of the model.
This is a very unbalanced dataset so we must be careful about the metric used to calculate the perfomance, for example accuracy is a bad one, so we will use the average precision. This metrics calculates the area under the precision recall curve (approximately).
We will also use other metrics, like ROC curves, the precision-recall curve and other types of scores.
We will use the models available in scikit-learn and we will find the best values for their hyperparameters.
The datasets that we will create are:
df_training ⟶ 70% tot
df_cv ⟶ 20% tot
df_test ⟶ 10% tot
# Shuffle each set with the fixed seed, then split 70/20/10 into
# train / cv / test and print the class composition of every split.
set_1=set_1.sample(frac=1, random_state=random_state_seed)
set_2=set_2.sample(frac=1, random_state=random_state_seed)
set_3=set_3.sample(frac=1, random_state=random_state_seed)
(df_training1,df_cv1,df_test1)=create_df(set_1)
print_info_df(df_training1,df_cv1,df_test1)
(df_training2,df_cv2,df_test2)=create_df(set_2)
print_info_df(df_training2,df_cv2,df_test2)
(df_training3,df_cv3,df_test3)=create_df(set_3)
print_info_df(df_training3,df_cv3,df_test3)
The trainig dataset has: 662 events, and there are 342 fraud events and 320 good events. The cv dataset has: 189 events, and there are 80 fraud events and 109 good events. The test dataset has: 95 events, and there are 51 fraud events and 44 good events. The trainig dataset has: 1324 events, and there are 348 fraud events and 976 good events. The cv dataset has: 379 events, and there are 78 fraud events and 301 good events. The test dataset has: 189 events, and there are 47 fraud events and 142 good events. The trainig dataset has: 3311 events, and there are 340 fraud events and 2971 good events. The cv dataset has: 946 events, and there are 99 fraud events and 847 good events. The test dataset has: 473 events, and there are 34 fraud events and 439 good events.
Logistic Regression¶
What is a Logistic Regression Model?
The LR Model is a predictive model used mainly to study phenomenons with a binary event outcome (fail/pass, fraud/no fraud, 0/1, and so on) The model has a categorical dependent variable with two possible values (like 0 and 1). The logarithm of the odds is computed using a linear combination of one or more independent variables, called "predictors".
How we set up our Logistic Regression Model
We first define the train sets, the cross validation sets and the test sets.
Then, we initialize the Logistic Regressor of SkLearn using the liblinear solver (ideal for "small" datasets like this one) without specifying anything about the weight of the classes.
Finally, we evaluate the model using different methods.
# Predictor columns used to train every model: 'Time', the first 19 PCA
# components and 'Amount' ('Class' is the target and is intentionally
# excluded; V20-V28 were dropped earlier as weak predictors).
# NOTE(review): the previous `features.drop('Class')` call was dead code —
# its result was immediately overwritten by this list — and, lacking
# axis=1/columns=, it would have tried to drop a *row* labelled 'Class';
# the call has been removed.
features_train = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8',
                  'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16',
                  'V17', 'V18', 'V19', 'Amount']
Cross Validation¶
# Regularization strengths scanned during cross-validation: 10^-8 ... 10^10.
C_values = np.array([
    1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01,
    1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07,
    1.e+08, 1.e+09, 1.e+10,
])
# Fit one L2-regularized, class-weight-balanced logistic regression per
# scenario; the liblinear solver suits small datasets like these.
# All three models share the same hyperparameters.
lr_params = dict(solver='liblinear', random_state=random_state_seed,
                 penalty='l2', class_weight='balanced', C=1.e-04)

print('Logistic Regression with set 1: ')
model_LR1 = LogisticRegression(**lr_params)
(X_train1, Y_train1, X_cv1, Y_cv1, X_test1, Y_test1,
 Y_test_predict_proba1) = model_evaluation(df_training1, df_cv1, df_test1,
                                           features_train, model_LR1)
print('===========================================\n')

print('################ Logistic Regression with set 2: ################')
model_LR2 = LogisticRegression(**lr_params)
(X_train2, Y_train2, X_cv2, Y_cv2, X_test2, Y_test2,
 Y_test_predict_proba2) = model_evaluation(df_training2, df_cv2, df_test2,
                                           features_train, model_LR2)
print('===========================================\n')

print('################ Logistic Regression with set 3: ################')
model_LR3 = LogisticRegression(**lr_params)
(X_train3, Y_train3, X_cv3, Y_cv3, X_test3, Y_test3,
 Y_test_predict_proba3) = model_evaluation(df_training3, df_cv3, df_test3,
                                           features_train, model_LR3)
Logistic Regression with set 1:
precision recall f1-score support
0 0.54 1.00 0.70 44
1 1.00 0.27 0.43 51
accuracy 0.61 95
macro avg 0.77 0.64 0.57 95
weighted avg 0.79 0.61 0.56 95
Modello finale score (avg precision): 0.663983488132095
===========================================
################ Logistic Regression with set 2: ################
precision recall f1-score support
0 0.81 1.00 0.89 142
1 1.00 0.28 0.43 47
accuracy 0.82 189
macro avg 0.90 0.64 0.66 189
weighted avg 0.85 0.82 0.78 189
Modello finale score (avg precision): 0.4564899245750309
===========================================
################ Logistic Regression with set 3: ################
precision recall f1-score support
0 0.96 1.00 0.98 439
1 1.00 0.44 0.61 34
accuracy 0.96 473
macro avg 0.98 0.72 0.80 473
weighted avg 0.96 0.96 0.95 473
Modello finale score (avg precision): 0.4813456037806243
We can take a look inside our model using the .coef attribute of the regressor.
We can display the "contribution" that each variable give in the evaluation of the target class.
# Magnitude of each fitted coefficient: the size of a weight shows how
# strongly the corresponding feature contributes to the predicted class.
coef1, coef2, coef3 = (
    np.abs(m.coef_) for m in (model_LR1, model_LR2, model_LR3)
)
Variable Weights
# Colab form toggles: which sets to include in the coefficient barplot.
set_1 = "Yes" #@param ["Yes", "No"]
set_2 = "Yes" #@param ["Yes", "No"]
set_3 = "Yes" #@param ["Yes", "No"]

import plotly.graph_objects as go

variable_names = features_train

# One grouped bar trace per enabled set, keeping the original colors and
# trace order (Set 1, Set 2, Set 3).
fig = go.Figure()
for flag, weights, label, color in (
        (set_1, coef1, 'Set 1', '#0892A5'),
        (set_2, coef2, 'Set 2', '#A23E48'),
        (set_3, coef3, 'Set 3', '#A1E5AB')):
    if flag == 'Yes':
        fig.add_trace(go.Bar(x=variable_names,
                             y=weights.flatten(),
                             name=label,
                             marker_color=color))

fig.update_layout(
    title='Coefficient Plot of the Set',
    xaxis_tickfont_size=14,
    yaxis=dict(
        title='Weight',
        titlefont_size=16,
        tickfont_size=14,
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates
    bargroupgap=0.1   # gap between bars of the same location coordinate
)
fig.show()
Model Evaluation¶
We evaluate the results of the model in several ways.
First, we use a confusion matrix. It's a matrix with four quadrant, one for each combination of class and prediction.
Then we compute the score of the model using the dedicated sklearn function Average_precision_score
Finally, we plot the precision/recall curve which show the tradeoff between these two measure. A very useful indicator when the classes are unbalanced.
# Test-set recall of each logistic model (fraction of frauds caught),
# rounded to 3 decimals; output text is identical to the original cell.
for tag, mdl, X, Y in (('LR1', model_LR1, X_test1, Y_test1),
                       ('LR2', model_LR2, X_test2, Y_test2),
                       ('LR3', model_LR3, X_test3, Y_test3)):
    print('Recall ' + tag + ': ' + str(round(metrics.recall_score(Y, mdl.predict(X)), 3)))
Recall LR1: 0.275 Recall LR2: 0.277 Recall LR3: 0.441
# Three side-by-side panels of the probabilities predicted by each logistic
# model on its test sample (rendering delegated to plot_proba).
fig, ax = plt.subplots(1, 3, figsize=(20, 6))
for axis, mdl, X, label in zip(ax,
                               (model_LR1, model_LR2, model_LR3),
                               (X_test1, X_test2, X_test3),
                               ('Model LR1', 'Model LR2', 'Model LR3')):
    plot_proba(mdl, X, axis, label)
fig.suptitle('Predicted probabilities by the model on the test sample')
Text(0.5, 0.98, 'Predicted probabilities by the model on the test sample')
# Confusion matrices for the three logistic models on their test sets,
# drawn side by side by the notebook helper confusion_matrix_plots.
confusion_matrix_plots (Y = [Y_test1, Y_test2, Y_test3], X = [X_test1, X_test2, X_test3], models = [model_LR1, model_LR2, model_LR3],title='Confusion matrixes for the Logistic Regression models')
The classification report gives us a numerical evaluation of the model:
- The first column, "Precision", is defined as ratio between True Positives and (True Positives + False Positives)
- The second column, "Recall", is defined as the ratio between True Positives and (True Positives + False Negatives)
- The third column, "F1-Score", is the harmonic mean of the precedent measures. It's computed as 2 x (Precision x Recall) / (Precision + Recall)
Finally, we display the tradeoff between recall and precision mentioned before.
As we can see, our model slightly favors recall against precision.
# Precision/recall pairs over every decision threshold, one curve per set,
# then the combined precision-recall plot.
(precision1, recall1, thresholds1), \
(precision2, recall2, thresholds2), \
(precision3, recall3, thresholds3) = (
    precision_recall_curve(y, proba)
    for y, proba in ((Y_test1, Y_test_predict_proba1),
                     (Y_test2, Y_test_predict_proba2),
                     (Y_test3, Y_test_predict_proba3))
)
precision_recall_plot(precision_list=[precision1, precision2, precision3],
                      recall_list=[recall1, recall2, recall3],
                      labels=["Set1", "Set2", "Set3"])
plt.title('Logistic Regression Precision Recall curves by Set')
Text(0.5, 1.0, 'Logistic Regression Precision Recall curves by Set')
# Area under the ROC curve for each logistic model on its test sample;
# printed text is identical to the original cell.
for idx, (y, proba) in enumerate(((Y_test1, Y_test_predict_proba1),
                                  (Y_test2, Y_test_predict_proba2),
                                  (Y_test3, Y_test_predict_proba3)), start=1):
    print("Modello " + str(idx) + " (AUC_ROC): " + str(metrics.roc_auc_score(y, proba)))
Modello 1 (AUC_ROC): 0.9964349376114081 Modello 2 (AUC_ROC): 0.9605933473179503 Modello 3 (AUC_ROC): 0.9855286077984724
# Area under the precision-recall curve (average precision) per model;
# printed text is identical to the original cell.
for idx, (y, proba) in enumerate(((Y_test1, Y_test_predict_proba1),
                                  (Y_test2, Y_test_predict_proba2),
                                  (Y_test3, Y_test_predict_proba3)), start=1):
    print("Modello " + str(idx) + " (AUC_PR): " + str(metrics.average_precision_score(y, proba)))
Modello 1 (AUC_PR): 0.9969886938732714 Modello 2 (AUC_PR): 0.9382764135265615 Modello 3 (AUC_PR): 0.9598775150665023
Random Forest¶
What is a Random Forest Model?
Random forest is a flexible, easy to use machine learning algorithm that produces, even without hyper-parameter tuning, a great result most of the time. It is also one of the most used algorithms, because of its simplicity and diversity (it can be used for both classification and regression tasks).
A Random Forest model builds multiple decision trees and merges them together to get a more accurate and stable prediction.
Instead of searching for the most important feature while splitting a node, the random forest searches for the best feature among a random subset of features. This results in a wide diversity that generally results in a better model. Therefore, in random forest, only a random subset of the features is taken into consideration by the algorithm for splitting a node.
How we set up our Random Forest Model
We've used the integrated function of scikitlearn called "RandomForestClassifier" on the three datasets created at the start of the section ("df_training_1", "df_cv1", "df_test1"). We repeat this operation for each set.
In order to increase the efficiency of our model, we set up a grid search (a function that computes the outputs while trying different combinations of hyperparameters) and used the first-ranked combination.
from sklearn.ensemble import RandomForestClassifier

# One 100-tree random forest per scenario, evaluated with the same
# model_evaluation helper used for the logistic models.
# NOTE(review): random_state added so the forests are reproducible, matching
# the seeded logistic-regression runs; banner strings made consistent
# ("set 1"/"set 2"/"set 3" — previously "set2"/"set3").
print('################ Random forest with set 1: ################')
model_RF1 = RandomForestClassifier(n_estimators=100, random_state=random_state_seed)
(X_train1, Y_train1, X_cv1, Y_cv1, X_test1, Y_test1,
 YRF_test_predict_proba1) = model_evaluation(df_training1, df_cv1, df_test1,
                                             features_train, model_RF1)
print('===========================================\n')
print('################ Random forest with set 2: ################')
model_RF2 = RandomForestClassifier(n_estimators=100, random_state=random_state_seed)
(X_train2, Y_train2, X_cv2, Y_cv2, X_test2, Y_test2,
 YRF_test_predict_proba2) = model_evaluation(df_training2, df_cv2, df_test2,
                                             features_train, model_RF2)
print('===========================================\n')
print('################ Random forest with set 3: ################')
model_RF3 = RandomForestClassifier(n_estimators=100, random_state=random_state_seed)
(X_train3, Y_train3, X_cv3, Y_cv3, X_test3, Y_test3,
 YRF_test_predict_proba3) = model_evaluation(df_training3, df_cv3, df_test3,
                                             features_train, model_RF3)
################ Random forest with set 1: ################
precision recall f1-score support
0 0.96 1.00 0.98 44
1 1.00 0.96 0.98 51
accuracy 0.98 95
macro avg 0.98 0.98 0.98 95
weighted avg 0.98 0.98 0.98 95
Modello finale score (avg precision): 0.9818369453044375
===========================================
################ Random forest with set2: ################
precision recall f1-score support
0 0.96 0.98 0.97 142
1 0.93 0.87 0.90 47
accuracy 0.95 189
macro avg 0.95 0.93 0.93 189
weighted avg 0.95 0.95 0.95 189
Modello finale score (avg precision): 0.8446087009916797
===========================================
################ Random forest with set3: ################
precision recall f1-score support
0 1.00 1.00 1.00 439
1 0.94 0.94 0.94 34
accuracy 0.99 473
macro avg 0.97 0.97 0.97 473
weighted avg 0.99 0.99 0.99 473
Modello finale score (avg precision): 0.8900414785986526
In this section we want to have a look into the grid search method to look for the best parameters for the model. Finding the best classifier goes beyond the aim of this project, so it is not investigated further. We will use the classifier with the default parameters.
Cross Validation (grid search)¶
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the random-forest search
# (1 * 4 * 2 * 3 * 3 * 4 = 288 candidate combinations).
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)
# Exhaustive 3-fold grid search over param_grid on a fresh (default) forest,
# using all CPU cores; verbose=2 logs every candidate fit. The actual .fit()
# is deliberately left commented out below because it takes ~10 minutes.
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
cv = 3, n_jobs = -1, verbose = 2)
################################################################################################
################################################################################################
################################################################################################
############################## ATTENTION ##################################
################################################################################################
################################################################################################
################################################################################################
#Just a fast application of the grid search to looking for the best parameters
#Around 10 min to compile this section, if you have no time please go on the next cells
#grid_search.fit(X_train1, Y_train1)
#grid_search.best_params_
#df_grid_search=pd.DataFrame(grid_search.cv_results_)
#df_grid_search.sort_values('rank_test_score').head()
Model Evaluation¶
# Confusion matrices for the three random-forest models on their test sets.
confusion_matrix_plots(Y=[Y_test1, Y_test2, Y_test3],
                       X=[X_test1, X_test2, X_test3],
                       models=[model_RF1, model_RF2, model_RF3],
                       title='Confusion matrixes for the Random Forest models')

# Feature importances play the role that |coefficients| played for the
# logistic models. feature_importances_ is already non-negative; np.abs is
# kept only for symmetry with the LR code.
coef1_RF = np.abs(model_RF1.feature_importances_)
coef2_RF = np.abs(model_RF2.feature_importances_)
coef3_RF = np.abs(model_RF3.feature_importances_)

# NOTE(review): fixed garbled plot title ("Sets infor the" -> "Sets in the").
barplots(variable_names=features_train,
         coefficients=[coef1_RF, coef2_RF, coef3_RF],
         title='Coefficient Plot for the 3 Sets in the Random Forest models',
         y_axis_name='Weight')
# Precision/recall pairs over every decision threshold for the random-forest
# models, then the combined precision-recall plot.
(precision1RF, recall1RF, thresholds1RF), \
(precision2RF, recall2RF, thresholds2RF), \
(precision3RF, recall3RF, thresholds3RF) = (
    precision_recall_curve(y, proba)
    for y, proba in ((Y_test1, YRF_test_predict_proba1),
                     (Y_test2, YRF_test_predict_proba2),
                     (Y_test3, YRF_test_predict_proba3))
)
precision_recall_plot(precision_list=[precision1RF, precision2RF, precision3RF],
                      recall_list=[recall1RF, recall2RF, recall3RF],
                      labels=["Set1", "Set2", "Set3"])
plt.title('Random Forest Precision Recall curves by Set')
Text(0.5, 1.0, 'Random Forest Precision Recall curves by Set')
# Test-set recall of each random-forest model, rounded to 3 decimals;
# output text is identical to the original cell.
for tag, mdl, X, Y in (('RF1', model_RF1, X_test1, Y_test1),
                       ('RF2', model_RF2, X_test2, Y_test2),
                       ('RF3', model_RF3, X_test3, Y_test3)):
    print('Recall ' + tag + ': ' + str(round(metrics.recall_score(Y, mdl.predict(X)), 3)))
Recall RF1: 0.961 Recall RF2: 0.872 Recall RF3: 0.941
Learning curves¶
# Learning curves (score as a function of the training-set size), one panel
# per random-forest model.
# BUG FIX(review): model_RF1 was passed for all three panels even though the
# labels and training sets referred to models 2 and 3; panels 2 and 3 now use
# model_RF2 and model_RF3.
fig, ax = plt.subplots(3, 1, figsize=(15, 15))
plot_learning_curves(model_RF1, 'Random Forest 1', ax[0], X_train1, Y_train1)
plot_learning_curves(model_RF2, 'Random Forest 2', ax[1], X_train2, Y_train2)
plot_learning_curves(model_RF3, 'Random Forest 3', ax[2], X_train3, Y_train3)
DATA INTERPRETATION¶
In this section we will conclude our studies on data visualization and machine learning models applied to a classification task with a highly unbalanced dataset. We will discuss the main differences among the datasets and the models used and their performance on the test sample.
As we've said in the introduction, the dataset is strongly unbalanced towards the negative class (legit transactions).
Furthermore, the values of the features aren't very relevant, considering the PCA-transformation.
Creating a predictive model with this type of dataset is very difficult and can result in overfitting.
Data Selection
Our goal was to create an easily scalable model while avoiding overfitting. To do that we have designed three scenarios:
- One with a balanced dataset (50% of legit transactions and 50% of fraud transactions)
- One slightly unbalanced (75% of legit transactions and 25% of fraud transactions)
- One heavily unbalanced (90% of legit transactions and 10% of fraud transactions)
After that, we've removed the variables that weren't strong predictors of the target class (from feature V20 up to the feature V28).
We've computed the relation between the variables and the target class using a correlation matrix and an heatmap to visualize that.
You can see from the heatmaps that increasingly unbalanced datasets lead to weaker relations between the variables and the target class.
Models Comparison
For the actual machine learning part, we've chosen a simple yet powerful model called "Logistic Regressor" that has yielded good results. Even with the third dataset, the most unbalanced one, the model still had good performances.
The results are consistent in the three models, as displayed by the interactive barplot in the notebook, because the coefficients are proportional to the dataset.
Lastly, we've used a more complex model, called "Random Forest", to have a term of comparison with the Logistic Regressor.
In order to tune the hyperparameters at their best, we've used a simple grid search over the most important features (it wasn't feasible doing that for every feature because of the computational cost of the operation).
The Random Forest model showed far better results than the Logistic Regressor.
From the learning curve it's possible to see that increasing the sample size leads to better performances, although narrowly.
As a final note, we should be aware that we have few data for the test part. So, the confusion matrix and the precision/recall plots aren't very reliable; they give us just a general idea of the performance.
In conclusion, we can say that the best models are the Logistic Regression and the Random Forest applied over the third set.
Thanks for the reading,
Luca Pessina e Gabriele Carbone
APPENDIX¶
In this short section we will post all the non-essential material; it could be used to better understand some parts of the previous sections.
It is possible to see the plots obtained from the cross validation on the parameter C for the logistic regression. The validation and training scores are shown against the corresponding value of C. The datasets are very small and we select the data in a random way, so these graphs don't give us very reliable information. It is possible to see the high overfitting status of the model for values of C>1 and the underfitting status for C<1 given by the high regularization.
# Cross-validate the regularization strength C for model 1 and plot the
# training vs validation scores across C_values.
train_scores1,cv_scores1=calculate_plot_cv_LR(X_train1,Y_train1,X_cv1,Y_cv1,C_values)
plt.title('Model LR1')
Text(0.5, 1.0, 'Model LR1')
# Cross-validate the regularization strength C for model 2 and plot the
# training vs validation scores across C_values.
train_scores2,cv_scores2=calculate_plot_cv_LR(X_train2,Y_train2,X_cv2,Y_cv2,C_values)
plt.title('Model LR2')
Text(0.5, 1.0, 'Model LR2')
# Cross-validate the regularization strength C for model 3 and plot the
# training vs validation scores across C_values.
train_scores3,cv_scores3=calculate_plot_cv_LR(X_train3,Y_train3,X_cv3,Y_cv3,C_values)
plt.title('Model LR3')
Text(0.5, 1.0, 'Model LR3')